Maverik is interested in producing more accurate financial plans and
initial ROI documents for future convenience store locations.
Considerable uncertainty is associated with deploying new locations in
their network and being able to properly allocate resources and
accurately predict profitability is crucial to the prosperity of their
business. To this end, this project aims to augment the veracity of
financial plans and ROI documents by leveraging the store-level time
series and qualitative data collected by Maverik. This will be done by
employing an ensemble of forecasting and supervised regression models
designed to provide daily store level sales forecasts of multiple key
product categories. Success of this project will be benchmarked against
Maverik’s existing Naive forecasting solution and the proposed model
will be tuned to minimize:
Key abilities of the final product must include:
The ability to accurately forecast store sales will enable Maverik to optimize the expenditure of scarce resources by pursuing the most profitable locations and minimizing misallocation of said resources when opening a new store. This will lead to better investment, decreased waste, and more reliable financial evaluations.
As well as gaining a greater holistic understanding of the provided
data, this EDA aims to answer the following questions:
# Load packages
library(lemon)
library(skimr)
library(lubridate)
library(magrittr)
library(zoo)
library(tidyverse)
library(readxl)
library(scales)
library(GGally)
library(ggrepel)
library(ggforce)
library(caret)
library(ggTimeSeries)
library(ggpointdensity)
library(fpp3)
library(patchwork)
library(plotly)
# Load data
# Load the store-level daily sales time series and standardize its layout
mvts <- read_csv("../../../data/time_series_data_msba.csv") %>%
  # drop the unnamed row-index column exported with the CSV
  select(-1) %>%
  # assign short snake_case names (original headers are verbose)
  rename_with(~c("open_date", "date", "week_id", "day_name", "holiday",
                 "day_type", "inside_sales", "food_service", "diesel",
                 "unleaded", "site_id")) %>%
  # lead with the store identifier, then sort chronologically within store
  relocate(site_id) %>%
  arrange(site_id, date)
# Load the site-attribute (qualitative) data and standardize its layout
mvq <- read_csv("../../../data/qualitative_data_msba.csv") %>%
  # drop the unnamed row-index column; the two backticked columns duplicate
  # `rv_lanes_fueling_positions` and `hi_flow_lanes_fueling_positions`
  select(-c(1, `RV Lanes Fueling Positions`, `Hi-Flow Lanes Fueling Positions`)) %>%
  # standardize the store identifier name and lead with it
  rename(site_id = site_id_msba) %>%
  relocate(site_id) %>%
  # simplify the remaining column names
  rename_with(\(nm){
    tolower(nm) %>%
      # spaces, slashes, and hyphens become underscores
      gsub(" +|-|/", "_", .) %>%
      # strip straight and curly apostrophes
      gsub("'|’", "", .) %>%
      # prefix a leading digit so the result is a syntactic R name
      gsub("^(\\d)", "x\\1", .)
  }) %>%
  # avoid clashing with the `diesel` sales column in the time series data
  rename(has_diesel = diesel)## ── Data Summary ────────────────────────
## Values
## Name Piped data
## Number of rows 13908
## Number of columns 11
## _______________________
## Column type frequency:
## character 4
## Date 2
## numeric 5
## ________________________
## Group variables None
##
## ── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 site_id 0 1 5 5 0 38 0
## 2 day_name 0 1 6 9 0 7 0
## 3 holiday 0 1 4 22 0 26 0
## 4 day_type 0 1 7 7 0 2 0
##
## ── Variable type: Date ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max median n_unique
## 1 open_date 0 1 2021-01-12 2022-08-16 2021-10-08 32
## 2 date 0 1 2021-01-12 2023-08-16 2022-04-18 947
##
## ── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
## 1 week_id 0 1 26.5 15.0 1 14 26 39 52 ▇▇▇▇▇
## 2 inside_sales 0 1 2847. 981. 0 2181. 2694. 3325. 7172. ▁▇▅▁▁
## 3 food_service 0 1 760. 342. 0 521. 697. 924. 2532. ▃▇▂▁▁
## 4 diesel 0 1 1703. 2161. 0 383. 1019. 2283. 20854. ▇▁▁▁▁
## 5 unleaded 0 1 2382. 1026. 240. 1654. 2257. 2928. 8077. ▅▇▂▁▁
The time series data contains the daily, store level sales of the
four target variables: inside_sales,
food_service, diesel, and
unleaded. Other data pertaining to dates are provided
herein, including:
week_id: Fiscal week number; day_name: Day of the week name;
open_date: Date the store opened; holiday: What holiday (if any) occurred on that
day; day_type: Either “WEEKDAY” or “WEEKEND”. None of the variables have any missing values that will have to be dealt with.
## ── Data Summary ────────────────────────
## Values
## Name Piped data
## Number of rows 37
## Number of columns 52
## _______________________
## Column type frequency:
## character 28
## numeric 24
## ________________________
## Group variables None
##
## ── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 site_id 0 1 5 5 0 37 0
## 2 lottery 0 1 2 3 0 2 0
## 3 freal 0 1 2 3 0 2 0
## 4 bonfire_grill 0 1 2 3 0 2 0
## 5 pizza 0 1 2 3 0 2 0
## 6 cinnabon 0 1 2 3 0 2 0
## 7 godfathers_pizza 0 1 2 2 0 1 0
## 8 ethanol_free 0 1 2 3 0 2 0
## 9 has_diesel 0 1 3 3 0 1 0
## 10 hi_flow_lanes 0 1 2 3 0 2 0
## 11 rv_lanes 0 1 2 3 0 2 0
## 12 hi_flow_rv_lanes 0 1 2 3 0 2 0
## 13 def 0 1 2 3 0 2 0
## 14 cat_scales 0 1 2 3 0 2 0
## 15 car_wash 0 1 2 2 0 1 0
## 16 ev_charging 0 1 2 2 0 1 0
## 17 rv_dumps 0 1 2 3 0 2 0
## 18 propane 0 1 2 3 0 2 0
## 19 traditional_forecourt_layout 0 1 5 7 0 2 0
## 20 traditional_forecourt_stack_type 0 1 4 11 0 3 0
## 21 rv_lanes_layout 0 1 3 7 0 3 0
## 22 rv_lanes_stack_type 0 1 3 5 0 3 0
## 23 hi_flow_lanes_layout 0 1 3 5 0 3 0
## 24 hi_flow_lanes_stack_type 0 1 3 5 0 2 0
## 25 hi_flow_rv_lanes_layout 0 1 3 7 0 4 0
## 26 hi_flow_rv_lanes_stack_type 0 1 3 5 0 3 0
## 27 non_24_hour 0 1 2 2 0 1 0
## 28 self_check_out 0 1 3 3 0 1 0
##
## ── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
## 1 open_year 0 1 2021. 0.475 2021 2021 2021 2022 2022 ▇▁▁▁▃
## 2 square_feet 0 1 4970. 576. 2933 5046 5046 5046 6134 ▁▁▁▇▁
## 3 front_door_count 0 1 2 0 2 2 2 2 2 ▁▁▇▁▁
## 4 years_since_last_project 0 1 1.65 0.484 1 1 2 2 2 ▅▁▁▁▇
## 5 parking_spaces 0 1 37.4 5.92 23 34 38 41 49 ▂▂▇▆▃
## 6 x1_mile_pop 0 1 6704. 5694. 0 1984 5574 11269 18692 ▇▃▃▃▂
## 7 x1_mile_emp 0 1 4758. 4697. 56 1771 3895 6002 26077 ▇▃▁▁▁
## 8 x1_mile_income 0 1 53300. 24333. 0 39538 46356 73519 110957 ▁▇▅▅▂
## 9 x1_2_mile_pop 0 1 1833. 1915. 0 262 1003 2686 5923 ▇▃▁▁▂
## 10 x1_2_mile_emp 0 1 1514. 2489. 31 386 1037 1616 15403 ▇▁▁▁▁
## 11 x1_2_mile_income 0 1 47012. 28092. 0 28185 44706 69253 104730 ▃▇▅▆▂
## 12 x5_min_pop 0 1 14529. 13419. 0 4725 12789 20792 55385 ▇▅▂▁▁
## 13 x5_min_emp 0 1 9122. 8657. 456 3131 6348 11479 34199 ▇▂▂▁▁
## 14 x5_min_inc 0 1 55292. 24372. 0 41009 50232 74911 114281 ▁▇▇▅▂
## 15 x7_min_pop 0 1 32304. 30875. 688 10672 24597 44247 137979 ▇▂▂▁▁
## 16 x7_min_emp 0 1 19224. 20477. 615 6870 11584 26238 83985 ▇▂▁▁▁
## 17 x7_min_inc 0 1 59850. 18967. 31540 48170 53805 77818 108534 ▅▇▁▃▁
## 18 traditional_forecourt_fueling_positions 0 1 14.3 3.95 10 12 12 16 24 ▇▂▁▂▁
## 19 hi_flow_lanes_fueling_positions 0 1 3.32 2.93 0 0 5 5 9 ▇▁▇▃▁
## 20 rv_lanes_fueling_positions 0 1 2.51 2.05 0 0 4 4 6 ▆▁▁▇▁
## 21 mens_toilet_count 0 1 2.38 0.924 0 2 2 3 5 ▂▇▇▁▁
## 22 mens_urinal_count 0 1 2.35 0.857 0 2 2 3 5 ▂▇▇▁▁
## 23 womens_toilet_count 0 1 4.65 1.75 0 4 4 6 10 ▂▇▇▁▁
## 24 womens_sink_count 0 1 1.70 0.740 0 1 2 2 4 ▁▅▇▁▁
The qualitative data set offers a mix of 52 categorical and numerical variables describing site attributes and demographic/socioeconomic statistics of the surrounding area. Interestingly, site #23065 is missing from this data set and will have to be removed from the time series data set.
## [1] 23065
mvq does not have any explicitly NA values,
but does utilize character strings to indicate such. Maverik has
indicated that “N/A”, “None”, or any analogous value indicates the
absence of that attribute at the store rather than missing data. In this
case, it would be best to apply a single value for these cases instead
of using explicit NA and removing.
mvq %>%
  # keep only columns where at least one value matches "N/A" or "none"
  select(where(~any(grepl("^N/?A$|^none$", ., ignore.case = TRUE)))) %>%
  # proportion of matching values per column (mean of a logical vector
  # equals sum of matches divided by row count)
  summarise(
    across(
      everything(),
      ~mean(grepl("^N/?A$|^none$", ., ignore.case = TRUE))
    )
  ) %>%
  # reshape to one row per column for readability
  pivot_longer(everything(),
               names_to = "col",
               values_to = "prop_na")## # A tibble: 7 × 2
## col prop_na
## <chr> <dbl>
## 1 traditional_forecourt_stack_type 0.730
## 2 rv_lanes_layout 0.378
## 3 rv_lanes_stack_type 0.405
## 4 hi_flow_lanes_layout 0.405
## 5 hi_flow_lanes_stack_type 0.405
## 6 hi_flow_rv_lanes_layout 0.378
## 7 hi_flow_rv_lanes_stack_type 0.405
This can be achieved with the following:
# Recode "N/A"-style placeholders to the single canonical value "None",
# per Maverik's guidance that these indicate absence rather than missingness
mvq1 <- mvq %>%
  mutate(
    across(
      where(~any(grepl("^N/?A$", ., ignore.case = TRUE))),
      ~if_else(grepl("^N/?A$", ., ignore.case = TRUE), "None", .)
    )
  )
# Confirm the recode preserved the same per-column proportions as before
mvq1 %>%
  # keep only columns that now contain the "None" placeholder
  select(where(~any(. == "None"))) %>%
  # proportion of "None" values per column
  summarise(
    across(
      everything(),
      ~mean(. == "None")
    )
  ) %>%
  # reshape to one row per column
  pivot_longer(everything(),
               names_to = "col",
               values_to = "prop_na")## # A tibble: 7 × 2
## col prop_na
## <chr> <dbl>
## 1 traditional_forecourt_stack_type 0.730
## 2 rv_lanes_layout 0.378
## 3 rv_lanes_stack_type 0.405
## 4 hi_flow_lanes_layout 0.405
## 5 hi_flow_lanes_stack_type 0.405
## 6 hi_flow_rv_lanes_layout 0.378
## 7 hi_flow_rv_lanes_stack_type 0.405
mvq also contains some zero-variance variables which
would not contribute to any model and should be taken out.
# Screen every column for zero/low variance: distinct-value count, variance
# (only meaningful for numeric columns), and a short sample of values
mvq1 %>%
summarise(
across(
everything(),
list(
# for each column, calculate number of distinct values
ndistinct = ~n_distinct(., na.rm = TRUE),
# for each column, calculate variance
var = ~var(., na.rm = TRUE) %>% round(4),
# for each column, concatenate up to three unique values
samp = ~paste0(na.omit(unique(.)[1:3]), collapse = ", ")
)
)
) %>%
# convert to long form: one row per column, stats spread across columns
pivot_longer(everything(),
names_to = c("col", ".value"),
names_pattern = "(.*)_(.*)") %>%
# columns with ndistinct == 1 carry no information and can be dropped
arrange(ndistinct)It also appears some of the fuel lane data is contradictory and/or
redundant. Site 22015 has no high-flow RV lanes as indicated by
hi_flow_rv_lanes but somehow their high-flow RV lane layout
is “In-Line”. While a single observation can easily be handled, this
instance further brings into question the pervasiveness of such errors,
detectable or otherwise.
# Show the seemingly contradictory combination: a site that reports no
# hi-flow RV lanes yet records an "In-Line" hi-flow RV lane layout
mvq1 %>%
filter(hi_flow_rv_lanes == "No",
hi_flow_rv_lanes_layout == "In-Line") %>%
select(site_id, hi_flow_rv_lanes, hi_flow_rv_lanes_layout)# Create list of desired lane variable combinations
# Pairs of fuel-lane variables to be linked in the sankey plot: each
# consecutive pair of names in `lane_chain` forms one comparison, so the
# chain of 8 variables yields 7 two-element pairs. `local()` keeps the
# helper vector out of the global environment.
sk_list <- local({
  lane_chain <- c(
    "hi_flow_lanes",
    "hi_flow_lanes_layout",
    "rv_lanes",
    "rv_lanes_layout",
    "hi_flow_rv_lanes",
    "hi_flow_rv_lanes_layout",
    "rv_lanes_fueling_positions",
    "hi_flow_lanes_fueling_positions"
  )
  lapply(seq_len(length(lane_chain) - 1),
         \(i) lane_chain[c(i, i + 1)])
})
# Take each value of list and perform transformations within a data frame:
# for every variable pair, count the observed value combinations and shape
# them into the node/link structure a sankey diagram requires
sk <- lapply(sk_list,
\(x){
mvq1 %>%
# ensure compatibility across variables when changing to long form
mutate(across(everything(), as.character)) %>%
# count instances of value combinations
count(
pick(all_of(x))
) %>%
# shorten column names for cleaner output
rename_with(~gsub("_lanes|_fueling", "", .)) %>%
# create row ids
mutate(id = row_number(),
.before = 1) %>%
# convert to long form
pivot_longer(-c(n, id)) %>%
# combine variable name and variable value into single column
unite("name",
name:value,
sep = ":\n") %>%
group_by(id) %>%
# place corresponding pair into different column
mutate(name2 = name[2]) %>%
ungroup() %>%
# remove duplicates
filter(name != name2)
}) %>%
# combine list output into single data frame
list_rbind() %>%
# create required positional ids for sankey plot
# (plotly expects zero-based node indices, hence the `- 1`)
mutate(from_id = match(name, unique(c(name, name2))) - 1,
to_id = match(name2, unique(c(name, name2))) - 1)
# create sankey plot of fuel-lane attribute combinations from `sk`
plot_ly(
type = "sankey",
# ensure plot covers entire space
domain = list(
x = c(0,1),
y = c(0,1)
),
orientation = "h",
node = list(
# set spacing between nodes
pad = 15,
thickness = 20,
line = list(
color = "black",
width = 0.5
),
label = unique(c(sk$name, sk$name2))
),
# Define paths
link = list(
source = sk$from_id,
target = sk$to_id,
value = sk$n,
# color concerning combinations
# NOTE(review): highlighting link 14 in red assumes a fixed row order in
# `sk` — re-verify the index if the upstream data or sk_list changes
color = c(rep("#bbbbbbaa", 13), "#ff8888", rep("#bbbbbbaa", 16))
)
) %>%
layout(
title = "Distribution of fuel lane descriptions",
font = list(
size = 10
)
)Furthermore, there are only two unique combinations of
rv_lanes_stack_type and
hi_flow_rv_lanes_stack_type: either both “None” or both
“HF/RV”. It seems extremely likely to me that “HF/RV” stands for “Hi(gh)
Flow / Recreational Vehicle”.
If all of the following are true:
then it is necessarily true that all RV lanes in this data set are hi-flow. Site 22015 is the only observation which violates this condition. With so few sites to begin with, I’m hesitant to remove site 22015 from the data and instead would prefer to manually correct the data. A final decision will be made before modeling. In any case, the lane/pump data can be simplified to remove redundant data. This will also occur at a later time before modeling.
# Applying all basic cleaning steps and assigning to original object name.
# This repeats the earlier loads, then drops the site absent from `mvq`,
# recodes placeholder values, and removes zero-variance columns.
mvts <- read_csv("../../../data/time_series_data_msba.csv") %>%
# removing unnamed row index column
select(-1) %>%
# simplifying column names
rename_with(~str_split_1(paste0("open_date,date,week_id,day_name,holiday,",
"day_type,inside_sales,food_service,diesel,",
"unleaded,site_id"), ",")) %>%
# set site_id as first column
relocate(site_id) %>%
arrange(site_id, date) %>%
# removing store not found in `mvq`
filter(site_id != 23065)
mvq <- read_csv("../../../data/qualitative_data_msba.csv") %>%
# removing unnamed row index column
# `RV Lanes Fueling Positions` and `Hi-Flow Lanes Fueling Positions` are duplicated columns
select(-c(1, `RV Lanes Fueling Positions`, `Hi-Flow Lanes Fueling Positions`)) %>%
# set site_id as first column
select(site_id = site_id_msba, colnames(.)) %>%
# simplify column names
rename_with(\(x){
# replace spaces, slashes, and hyphens with underscores
gsub(" +|-|/", "_", tolower(x)) %>%
# remove single quotes and apostrophes
gsub("'|’", "", .) %>%
# validate variables beginning with numbers
gsub("^(\\d)", "x\\1", .)
}) %>%
# distinguish from variable found in time series data
rename(has_diesel = diesel) %>%
# recoding "N/A" placeholders to the canonical "None" value
mutate(
across(
where(~any(grepl("^N/?A$", ., ignore.case = TRUE))),
~replace(., grepl("^N/?A$", ., ignore.case = TRUE), "None")
)
) %>%
# removing zero-variance variables identified by the screen above
# NOTE(review): the nested c() below is redundant but harmless
select(-c(c(front_door_count, godfathers_pizza, has_diesel,
car_wash, ev_charging, non_24_hour, self_check_out))The entire data set spans from 2021-01-12 to 2023-08-16 and all 38 stores are present for one year and one day. Is the extra day of any analytical significance or simply an artifact of the fiscal calendar? Given that every store exists for the same length of time, it will be helpful to know the distribution of stores across dates.
# Earlier draft of the bar plot, kept for reference (superseded below)
# mvts %>%
# group_by(date) %>%
# # count of stores for each date
# summarise(n = n()) %>%
# ggplot() +
# geom_col(aes(date, n),
# fill = "darkred", color = "darkred") +
# # Fix date axis labels
# scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
# labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
# scale_y_continuous(breaks = seq(0, 30, 5)) +
# theme_minimal(20) +
# labs(title = "Count of stores for each date")
# Bar plot: number of stores reporting sales on each calendar date
{
mvts %>%
# count of stores for each date
count(date) %>%
ggplot() +
geom_col(aes(date, n),
fill = "darkred", alpha = 0.7, width = 1) +
# Fix date axis labels: year label in January, month abbreviation otherwise
scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
scale_y_continuous(breaks = seq(0, 30, 5)) +
theme_minimal(20) +
labs(title = "Count of stores vs date")
} / # vertically combining plots with `patchwork` package
# Line plot: each store's observed date range, ordered by opening date
{
mvts %>%
arrange(open_date, site_id, date) %>%
# rescale site_id for plot (1..n in opening order)
mutate(site_id = consecutive_id(site_id)) %>%
ggplot() +
geom_line(aes(date, site_id, group = site_id),
color = "steelblue", linewidth = 2, show.legend = FALSE) +
# Fix date axis labels
scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
scale_y_continuous(breaks = seq(0, 38, 4),
minor_breaks = NULL) +
theme_minimal(20) +
labs(title = "Store-wise date range",
y = "Cumulative store count")
}Given that open_date is not uniformly distributed,
network-wide seasonality will have to be calculated either on a sales
per store basis or by standardizing the date by attributing a day ID
similar to week_id. Maverik expressed the importance of
aligning days in a standardized manner, which is why
week_id is included. I’ll begin dissecting the fiscal
calendar structure by determining if a singular day of the week begins
each week.
# Determine which weekday begins each complete fiscal week
mvts %>%
  distinct(date, .keep_all = TRUE) %>%
  # week_id restarts each fiscal year, so build an id unique to every week
  group_by(unique_week_id = consecutive_id(week_id)) %>%
  # keep only complete (7-day) weeks
  filter(n() == 7) %>%
  # take the first day of each fiscal week
  slice(1) %>%
  ungroup() %>%
  # tally the weekday names of those first days
  count(day_name)## # A tibble: 1 × 2
## day_name n
## <chr> <int>
## 1 Friday 134
Now that we know each (complete) week begins on Friday, we can begin constructing a standardized day_id.
# Begin with completed date range found in data and derive a standardized
# fiscal-calendar day id (1..365/366 within each fiscal year)
day_id_df <- tibble(date = seq(as_date("2021-01-01"), as_date("2023-12-31"), "1 day")) %>%
# Calculate week_id: weeks start on Friday (week_start = 5), matching the
# fiscal calendar established above; "%V" extracts the week number
mutate(week_id = yearweek(date, week_start = 5) %>% format("%V") %>% as.numeric(),
# since the first day of fiscal year 2022 is actually in 2021, special logic must be
# applied to identify the beginning of the year:
# `x` flags each 52 -> 1 week transition as a fiscal-year start
x = case_when(lag(week_id, default = 52) == 52 & week_id == 1 ~ 1),
# cumulative count of flags (via expanding rolling sum) gives the fiscal year
year = 2020 + rollapplyr(x, width = n(), FUN = sum, na.rm = TRUE, partial = TRUE)) %>%
group_by(year) %>%
# day_id: position of the date within its fiscal year
mutate(day_id = row_number()) %>%
select(-x) %>%
ungroup()
day_id_df# Validating accuracy by comparing to mvts' week_id
# Validate the derived week_id against the native one: any mismatch in
# per-week day counts would appear as rows below (none expected)
day_id_df %>%
# trimming to mvts' date range
filter(date >= "2021-01-12",
date <= "2023-08-16") %>%
# counting instances of manually calculated week_id and renaming columns
count(custom_week_id = week_id,
name = "custom_n") %>%
bind_cols(mvts %>%
distinct(date, week_id) %>%
# counting instances of native week_id and renaming columns
count(mvts_week_id = week_id,
name = "mvts_n")) %>%
# subsetting discrepant rows
filter(custom_n != mvts_n)mvts %>%
# Aggregate sales network-wide across the four target metrics
summarise(across(inside_sales:unleaded, sum)) %>%
# Convert to long-form
pivot_longer(everything(),
names_to = "metric",
values_to = "sales") %>%
# get percentage of sales by sales metric
mutate(p = percent(sales/sum(sales), 0.1))mvts %>%
# aggregate sales by site
group_by(site_id) %>%
summarise(across(inside_sales:unleaded, sum)) %>%
# convert to long-form
pivot_longer(inside_sales:unleaded,
names_to = "metric",
values_to = "sales") %>%
# get percentage of sales by sales metric, within each store
group_by(site_id) %>%
mutate(p = sales/sum(sales),
# create separate variable to be used in `arrange`
d = p[metric == "diesel"]) %>%
arrange(-d) %>%
ungroup() %>%
# ensure stores are arranged by % diesel in plot
mutate(site_index = consecutive_id(site_id),
metric = fct_reorder(metric, p)) %>%
# stacked bars: per-store sales mix, labeled with percentages
ggplot(aes(site_index, p)) +
geom_col(aes(fill = metric)) +
geom_text(aes(label = percent(p, 1), group = metric),
# place label in center of `group`
position = position_stack(vjust = 0.5)) +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Proportion of total store sales by sales metric",
x = "site index (descending diesel contribution)",
y = NULL)# Histogram (density)
# Density of annualized (per-store total) sales for each metric
mvts %>%
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
group_by(metric, site_id) %>%
summarise(sales = sum(sales)) %>%
# NOTE(review): avg/med are computed but not used in this plot
mutate(avg = mean(sales),
med = median(sales)) %>%
ggplot() +
geom_density(aes(sales, color = metric),
linewidth = 1) +
#coord_cartesian(xlim = c(0, 2e6)) +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Distribution of key sales metrics")# Violin
# Violin plot of annualized per-store sales by metric, with quartile lines
mvts %>%
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
group_by(metric, site_id) %>%
summarise(sales = sum(sales)) %>%
ggplot() +
geom_violin(aes(metric, sales, fill = metric),
color = "white", draw_quantiles = c(0.25, 0.5, 0.75),
scale = "area") +
theme_minimal(20) +
labs(title = "Distribution of key sales metrics")# Line
# Network-wide daily sales by standardized day_id, colored by weekday/weekend
mvts %>%
# add `day_id`
left_join(day_id_df %>%
select(date, day_id), "date") %>%
# place all sales into one column, labeled by another
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
# aggregate by `day_id` and sales metric
group_by(metric, day_id) %>%
summarise(sales = sum(sales),
day_type = day_type[1]) %>%
ggplot() +
# `group = 1` prevents a separate line for each `day_type`
geom_line(aes(day_id, sales, color = day_type, group = 1)) +
# create separate plot for each `metric`
facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank())# Line with holiday
# Categorical line color: same series as above but highlighting holidays
mvts %>%
# add `day_id`
left_join(day_id_df %>%
select(date, day_id), "date") %>%
group_by(day_id) %>%
mutate(is_holiday = case_when(holiday != "NONE" ~ "holiday",
.default = "regular day")) %>%
# place all sales into one column, labeled by another
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
# aggregate by `day_id` and sales metric
group_by(metric, day_id) %>%
summarise(sales = sum(sales),
is_holiday = is_holiday[1]) %>%
ggplot() +
# `group = 1` prevents a separate line for each `is_holiday`
geom_line(aes(day_id, sales, color = is_holiday, group = 1),
linewidth = 1) +
# Create generalized date labels
# NOTE(review): breaks use 2021 month lengths as a generic template —
# confirm this approximation is acceptable for the fiscal calendar
scale_x_continuous(breaks = c(1, cumsum(days_in_month(as_date(paste0("2021-", month.abb, "-01")))) + 1),
labels = ~format(as_date("2021-12-31") + ., "%b %d")) +
# Rescale labels to thousands
scale_y_continuous(labels = ~./1e3) +
# create separate plot for each `metric`
facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Annualized store sales",
x = "generalized date",
y = "sales $ (thousands)")# Create df for plot to shade holidays
# Build a helper frame giving, per holiday and metric, the day_id span and
# full y-range so each holiday can be shaded as a rectangle in the plot below
holiday_4gg <- mvts %>%
# add `day_id`
left_join(day_id_df %>%
select(date, day_id), "date") %>%
# place all sales into one column, labeled by another
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
# aggregate by `day_id` and sales metric
group_by(metric, day_id) %>%
mutate(sales = sum(sales)) %>%
# aggregate by `metric` to get the full vertical extent of each facet
group_by(metric) %>%
mutate(sales_max = max(sales),
sales_min = min(sales)) %>%
# Keep only holidays
filter(holiday != "NONE") %>%
group_by(holiday, metric) %>%
# get min/max of `day_id` and `sales` (two rows per holiday/metric)
reframe(day_id = range(day_id),
sales = c(sales_min[1], sales_max[1]))
# Daily sales by day_id with holiday spans shaded via geom_mark_rect
mvts %>%
# add `day_id`
left_join(day_id_df %>%
select(date, day_id), "date") %>%
# place all sales into one column, labeled by another
pivot_longer(c(inside_sales, food_service, diesel, unleaded),
names_to = "metric",
values_to = "sales") %>%
# aggregate by `day_id` and sales metric
group_by(metric, day_id) %>%
summarise(sales = sum(sales),
day_type = day_type[1]) %>%
ggplot() +
# shaded rectangles spanning each holiday's day_id range (from holiday_4gg)
geom_mark_rect(data = holiday_4gg,
aes(day_id, sales,
group = interaction(holiday, metric)),
expand = unit(1.5, "mm"), radius = unit(0.5, "mm"),
fill = "#000000", alpha = 0.1, color = NA) +
# `group = 1` prevents a separate line for each `day_type`
geom_line(aes(day_id, sales, color = day_type, group = 1)) +
# Rescale labels to thousands
scale_y_continuous(labels = ~./1e3) +
# create separate plot for each `metric`
facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Sales vs day_id",
subtitle = "Shaded regions indicate holidays",
y = "sales $ (thousands)")# Correlation
# Pairwise correlations of the four metrics at the daily observation level
mvts %>%
left_join(day_id_df %>%
select(date, day_id), "date") %>%
select(inside_sales, food_service, diesel, unleaded) %>%
ggpairs() +
labs(title = "Correlation of daily store sales")mvts %>%
# Pairwise correlations of annualized (per-store total) sales
left_join(day_id_df %>%
select(date, day_id), "date") %>%
group_by(site_id) %>%
# NOTE(review): site_id remains in this frame, so ggpairs includes it —
# confirm that is intended
summarise(across(c(inside_sales, food_service, diesel, unleaded), sum)) %>%
ggpairs() +
labs(title = "Correlation of annualized store sales")Based on the time series line plot, it’s clear that there is weekly and yearly seasonality but weekly seasonality is more pronounced with weekdays seeing greater sales than weekends. We also see that certain holidays affect sales more than others. This is not surprising but this will certainly have to be accounted for during modeling.
All of the sales metrics besides diesel are fairly
normally distributed but still right-skewed. Intuitively,
inside_sales and food_service are very
positively correlated. The relationship of the other pairs are rather
heteroskedastic or otherwise inconsistent. Determining which (if any)
features in the available data can explain this irregular variance will
be valuable when forecasting.
The scatter plot from the generalized pairs plot reveals that some
locations had no sales on at least one day. One may think that these
might be stores not open 24 hours a day, but we discovered earlier that
only one value for non_24_hour exists in the data and
subsequently all stores are open 24 hours a day.
# Summary statistics for the stores that recorded zero (or negative) sales
# on at least one day, including a count of zero-sales days per metric
mvts %>%
group_by(site_id) %>%
filter(if_any(c(inside_sales, food_service, diesel, unleaded), ~any(.<=0))) %>%
summarise(across(c(inside_sales, food_service, diesel, unleaded),
list(avg = ~mean(.),
med = ~median(.),
sd = ~sd(.),
max = ~max(.),
min = ~min(.),
dec1 = ~quantile(., 0.1),
daysNoSales = ~sum(. == 0)))) %>%
# reshape: one row per site/metric, one column per statistic
pivot_longer(-site_id,
names_to = c("metric", ".value"),
names_pattern = "(.*)_(.*)")We can determine that only two stores account for the zero sales values. Based on each stores’ summary statistics, the absence of sales for even a single day seems peculiar. This could be indicative of some disruption to the facilities that is not indicated in the data. It can be inferred that other stores could have possibly also experienced similar, but not complete, disruptions that would affect their sales that we cannot account for. Unfortunately, not much can be done about this and hopefully the scale of such instances are minor.
Conversely, there seems to be at least one store that has dramatically higher sales than the rest which begs the question of what to do with sufficiently outlying data. Before deciding to remove such cases, it will be important to see if the other corresponding data can suggest an explanatory pattern which is valuable for prediction.
# Scatter of diesel vs inside sales, highlighting the outlying diesel seller
mvts %>%
# distinguish unusual diesel seller (site 21980)
mutate(big_seller = site_id == 21980,
alpha = ifelse(big_seller, 1, 0.4)) %>%
# draw the highlighted store's points last so they sit on top
arrange(big_seller) %>%
ggplot() +
geom_point(aes(inside_sales, diesel, color = big_seller, alpha = alpha)) +
# dynamic opacity: use the alpha column values directly
scale_alpha_identity() +
theme_minimal(20) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Diesel vs Inside Sales, spotlight on Site 21980")# Numerical features
# Join annualized sales to the numeric site attributes for a correlation
# matrix; drop identifiers and near-constant year/project columns
corr_df <- mvts %>%
group_by(site_id) %>%
summarise(across(inside_sales:unleaded, sum)) %>%
left_join(mvq, "site_id") %>%
select(where(is.numeric), -c(site_id, open_year, years_since_last_project))
# Correlogram with hierarchical ordering; insignificant cells blanked
corr_df %>%
cor() %>%
ggcorrplot::ggcorrplot(method = "circle",
p.mat = ggcorrplot::cor_pmat(corr_df),
lab = TRUE,
lab_size = 3,
outline.color = "white",
hc.order = TRUE,
insig = "blank")# Sina plot, diesel vs women's sink, store+date level
# Sina plot: distribution of daily sales conditioned on womens_sink_count
# (a proxy for store size), with dashed per-group median bars
mvts %>%
left_join(mvq, "site_id") %>%
pivot_longer(inside_sales:unleaded,
names_to = "metric",
values_to = "sales") %>%
group_by(womens_sink_count, metric) %>%
mutate(med = median(sales)) %>%
ggplot() +
geom_sina(aes(womens_sink_count, sales,
color = after_stat(scaled),
group = womens_sink_count),
size = 1, scale = "width") +
# collapse each group to one row so a single median bar is drawn per group
geom_linerange(data = \(x) {x %>%
summarise_all(~.[1])},
aes(y = med, xmin = womens_sink_count - 0.25, xmax = womens_sink_count + 0.25),
linetype = "dashed", linewidth = 1) +
viridis::scale_color_viridis(guide = guide_colorbar(title.position = "top",
title.hjust = 0.5)) +
scale_y_continuous(labels = ~./1000) +
facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free") +
theme_minimal(20) +
theme(legend.direction = "horizontal",
legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(title = "Distribution of daily store sales sales\n given count of women's sinks",
y = "sales $ (thousands)",
color = "scaled density")mvts %>%
# Sina grid: daily sales conditioned on fuel pump counts by pump type
left_join(mvq, "site_id") %>%
mutate(total_pumps = traditional_forecourt_fueling_positions +
hi_flow_lanes_fueling_positions +
rv_lanes_fueling_positions) %>%
pivot_longer(c(contains("position"), total_pumps),
names_to = "pump_type",
values_to = "n_pumps") %>%
pivot_longer(inside_sales:unleaded,
names_to = "metric",
values_to = "sales") %>%
group_by(pump_type, n_pumps, metric) %>%
mutate(med = median(sales)) %>%
ggplot() +
geom_sina(aes(n_pumps, sales,
color = after_stat(scaled),
group = n_pumps),
size = 0.5, scale = "width") +
# collapse each group to one row so a single median bar is drawn per group
geom_linerange(data = \(x) {x %>%
#group_by(categ) %>%
summarise_all(~.[1])},
aes(y = med, xmin = n_pumps - 0.4, xmax = n_pumps + 0.4),
linewidth = 0.5, color = "red") +
viridis::scale_color_viridis(guide = guide_colorbar(title.position = "top",
title.hjust = 0.5)) +
scale_y_continuous(labels = ~./1000) +
facet_rep_grid(metric~pump_type, repeat.tick.labels = TRUE, scales = "free") +
theme_minimal(20) +
theme(legend.direction = "horizontal",
legend.position = "top",
legend.key.width = unit(15, "mm"),
plot.margin = unit(c(0,0,0,0), "lines")) +
labs(title = "Distribution of daily sales given count of fuel pumps",
y = "sales $ (thousands)",
color = "scaled density")lapply(c("inside_sales", "food_service", "diesel", "unleaded"),
# For each target: fit a full multivariate LM on standardized daily data
# (with per-store lagged sales) and collect significant coefficients
\(x){
xdata <- mvts %>%
left_join(mvq, "site_id") %>%
# standardize numeric columns so coefficients are comparable
mutate(across(where(is.numeric), ~scale(.)[,1])) %>%
group_by(site_id) %>%
# one-day lag of each sales metric within store (last value pads day 1)
mutate(across(inside_sales:unleaded,
list(lag = ~lag(., default = last(.))))) %>%
ungroup() %>%
select(-c(where(is.Date), site_id, open_year, years_since_last_project))
xlm <- lm(
paste(x, "~ ."),
data = xdata
) %>%
summary
# tidy the coefficient table and attach significance codes and R^2
out <- xlm$coefficients %>%
as.data.frame(x = .) %>%
mutate(var = row.names(.),
.before = 1) %>%
as_tibble() %>%
mutate(sig = case_when(`Pr(>|t|)` <= 0.001 ~ "***",
`Pr(>|t|)` <= 0.01 ~ "**",
`Pr(>|t|)` <= 0.05 ~ "*",
`Pr(>|t|)` <= 0.1 ~ "."),
`Pr(>|t|)` = round(`Pr(>|t|)`, 5),
r2 = xlm$r.squared,
target = x) %>%
relocate(target)
}) %>%
list_rbind() %>%
# keep significant, non-intercept terms; top 5 coefficients per target
filter(!grepl("Intercept", var),
grepl("\\*", sig)) %>%
group_by(target) %>%
slice_max(Estimate, n = 5)# Simple LM (one predictor) on annualized sales
# For each target: fit one single-predictor LM per column of the annualized,
# standardized site data, then keep the strongest significant relationships
lapply(c("inside_sales", "food_service", "diesel", "unleaded"),
\(x){
xdata <- mvts %>%
group_by(site_id) %>%
summarise(across(inside_sales:unleaded, sum)) %>%
left_join(mvq, "site_id") %>%
# standardize numeric columns so slopes are comparable across predictors
mutate(across(where(is.numeric), ~scale(.)[,1])) %>%
ungroup() %>%
select(-c(where(is.Date), site_id, open_year, years_since_last_project))
# one simple regression per candidate predictor column
lapply(colnames(xdata),
\(y){
xlm <- lm(
paste(x, "~", y),
data = xdata
) %>%
summary
# tidy the coefficient table and attach significance codes and R^2
out <- xlm$coefficients %>%
as.data.frame(x = .) %>%
mutate(var = row.names(.),
.before = 1) %>%
as_tibble() %>%
mutate(sig = case_when(`Pr(>|t|)` <= 0.001 ~ "***",
`Pr(>|t|)` <= 0.01 ~ "**",
`Pr(>|t|)` <= 0.05 ~ "*",
`Pr(>|t|)` <= 0.1 ~ "."),
`Pr(>|t|)` = round(`Pr(>|t|)`, 5),
r2 = xlm$r.squared,
target = x) %>%
relocate(target)
}) %>% list_rbind()
}) %>%
list_rbind() %>%
# keep significant, non-intercept terms; top 5 fits by R^2 per target
filter(!grepl("Intercept", var),
grepl("\\*", sig)) %>%
group_by(target) %>%
slice_max(r2, n = 5)